# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#%config InlineBackend.figure_format = 'retina'
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# Import KNN Regressor machine learning library
from sklearn.neighbors import KNeighborsRegressor
# Import Decision Tree Regressor machine learning library
from sklearn.tree import DecisionTreeRegressor
# Import ensemble machine learning library
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,AdaBoostRegressor,BaggingRegressor)
# Import support vector regressor machine learning library
from sklearn.svm import SVR
#Import the metrics
from sklearn import metrics
#Import the Voting regressor for Ensemble
from sklearn.ensemble import VotingRegressor
# Import stats from scipy
from scipy import stats
#importing the metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
#importing the K fold
from sklearn.model_selection import KFold
#importing the cross validation score
from sklearn.model_selection import cross_val_score
#importing the preprocessing library
from sklearn import preprocessing
# importing the Polynomial features
from sklearn.preprocessing import PolynomialFeatures
#importing kmeans clustering library
from sklearn.cluster import KMeans
from sklearn.utils import resample
# Load the concrete dataset and take a first look at it.
df = pd.read_csv("concrete (1).csv")

# Dimensions of the data (rows, columns).
df.shape

# Column dtypes / non-null counts — compare against the data dictionary.
df.info()

# Transposed summary statistics for a quick distribution scan.
df.describe().T

# First five records.
df.head()

# Last five records.
df.tail()

# Ten records picked at random.
df.sample(10)
# One row of plots per column: distribution (histogram + rug) on the left,
# box plot on the right.
# FIX: the loop body below was un-indented in the original (notebook
# flattening artifact) — restored so the block is valid Python.
fig, axs = plt.subplots(nrows=9, ncols=2, figsize=(10, 30))
for i, x in enumerate(df.columns):
    # NOTE(review): sns.distplot is deprecated (removed in seaborn >= 0.14);
    # kept for behavioural compatibility — migrate to histplot/displot.
    sns.distplot(df[x], ax=axs[i, 0], bins=50, rug=True)
    sns.boxplot(orient='v', data=df[x], ax=axs[i, 1])
fig.tight_layout()
plt.show()
plt.clf()
plt.close()

# Box plots of all features on one shared axis.
df.boxplot(figsize=(15, 10));

# Pairwise scatter plots with KDE on the diagonal.
sns.pairplot(df, diag_kind='kde');

# Per-column histograms.
df.hist(bins=30, figsize=(25, 10));
# Correlation heat map of all columns.
plt.figure(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True, cmap="cividis", linecolor="black");

# Correlations with the target, weakest to strongest.
df.corr()['strength'].sort_values()

# Regression plots for a few feature pairs of interest
# (cement vs strength, cement vs water, superplastic vs water, ash vs strength).
for x_col, y_col in [("cement", "strength"),
                     ("cement", "water"),
                     ("superplastic", "water"),
                     ("ash", "strength")]:
    sns.lmplot(x=x_col, y=y_col, data=df)
    plt.show()

# Missing-value count per column.
df.isnull().sum()

# Box plots to eyeball outliers before treatment.
df.boxplot(figsize=(10, 10));
# Count outliers per feature using the 1.5*IQR fences.
# The last column (the target, strength) is deliberately excluded.
# FIX: the loop bodies below were un-indented in the original (notebook
# flattening artifact) — restored so the block is valid Python.
for cols in df.columns[:-1]:
    q1 = df[cols].quantile(0.25)
    q3 = df[cols].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    print('Outliers count for', cols, df.loc[((df[cols] > high) | (df[cols] < low)), cols].count())
    print('high value Outliers count for', cols, df.loc[(df[cols] > high), cols].count())
    print('low value Outliers count for', cols, df.loc[(df[cols] < low), cols].count())

# Show the full records that contain outliers, feature by feature.
for cols in df.columns[:-1]:
    q1 = df[cols].quantile(0.25)
    q3 = df[cols].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    print()
    print('Outliers count for', cols)
    print(df.loc[((df[cols] > high) | (df[cols] < low)), :])
# Winsorize: clip each feature (target excluded) to its 1.5*IQR fences.
# FIX: the loop body below was un-indented in the original (notebook
# flattening artifact) — restored so the block is valid Python.
for cols in df.columns[:-1]:
    q1 = df[cols].quantile(0.25)
    q3 = df[cols].quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    df.loc[(df[cols] < low), cols] = low
    df.loc[(df[cols] > high), cols] = high

# Re-check quartiles/outliers after the capping.
df.boxplot(figsize=(10, 8));

# Re-check distributions for new peaks/bulges introduced by the treatment.
sns.pairplot(df, diag_kind='kde');

# Standardize all columns (z-score). NOTE(review): this scales the target
# column as well — all later R^2 scores are on the scaled target.
df_scaled = preprocessing.scale(df)
df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
df_scaled.describe().T
# Split into features (first 8 columns) and target (9th column, strength).
X = df_scaled.iloc[:, 0:8]
y = df_scaled.iloc[:, 8]

# Training and test set in 70:30 ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

# OLS - LinearRegression.
# All features were at best moderate predictors of strength (max corr ~66%
# with cement), so the linear model is expected to be weak.
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)
# FIX: the loop body below was un-indented in the original — restored.
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[idx]))
intercept = regression_model.intercept_
print("\nThe intercept for our model is {}".format(intercept))

# Predict strength on the held-out test set.
# (The original comment said "mileage (mpg)" — a copy/paste leftover from a
# different dataset; this is the concrete-strength target.)
y_pred = regression_model.predict(X_test)

# Predicted vs actual: a good model's points cluster along the diagonal.
plt.scatter(y_test, y_pred);
# Ridge regression (L2 penalty) with a small alpha.
ridge = Ridge(alpha=.3)
ridge.fit(X_train, y_train)
# FIX: the loop bodies below were un-indented in the original — restored.
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, ridge.coef_[idx]))

# Lasso regression (L1 penalty). The coefficients are kept for the feature
# selection section further down.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
featureAnalysisLasso = lasso.coef_
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, featureAnalysisLasso[idx]))

# Compare train/test R^2 across the three linear models.
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))
# --- Polynomial features: add model complexity (degree 2, interactions only) ---
poly = PolynomialFeatures(degree=2, interaction_only=True)
# X is already standardized, so it can be expanded directly.
X_poly = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.30, random_state=50)
X_train.shape

# Plain OLS on the expanded feature set.
regression_model.fit(X_train, y_train)
print(regression_model.coef_)
print(regression_model.score(X_train, y_train))
print(regression_model.score(X_test, y_test))

# Ridge on the expanded feature set.
ridge = Ridge(alpha=.3)
ridge.fit(X_train, y_train)
print("Ridge model:", (ridge.coef_))
print(ridge.score(X_train, y_train))
print(ridge.score(X_test, y_test))

# Lasso on the expanded feature set.
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
print("Lasso model:", (lasso.coef_))
print(lasso.score(X_train, y_train))
print(lasso.score(X_test, y_test))

# Visual sanity checks on the scaled data.
sns.pairplot(df_scaled, diag_kind='kde')
df_scaled.corr().strength.sort_values()
# --- K-means elbow analysis: how many natural groups are in the features? ---
# BUG FIX: the original ran this loop against `temp` before `temp` was
# defined (it was only created further down the script); build the
# feature-only frame here so the block runs in file order.
temp = df_scaled.drop(columns=['strength'], axis=1)

cluster_range = range(1, 15)
cluster_error = []
# FIX: the loop body below was un-indented in the original — restored.
# (Unused `labels`/`centroid` assignments from the original were dropped.)
for num_clusters in cluster_range:
    cluster = KMeans(num_clusters, n_init=20)
    cluster.fit(temp)
    # inertia_ = within-cluster sum of squares, used for the elbow plot
    cluster_error.append(cluster.inertia_)

clusters_df = pd.DataFrame({"num_clusters": cluster_range, "cluster_errors": cluster_error})
clusters_df[0:15]

from matplotlib import cm
# Elbow plot: look for the bend where adding clusters stops helping.
plt.figure(figsize=(6, 4))
plt.plot(clusters_df.num_clusters, clusters_df.cluster_errors, marker="X")
from copy import deepcopy
# Feature-only frame for clustering, plus a full copy for labelling.
temp = deepcopy(df_scaled.drop(columns=['strength'], axis=1))
temp2 = deepcopy(df_scaled)

# Bin the (scaled) target into 2 categories so the real strength split can
# be compared against the unsupervised clusters.
val2 = pd.cut(temp2.strength, bins=2, labels=np.arange(2), right=False)

# Fit a 2-component Gaussian mixture on the features only.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2, random_state=50)
gmm.fit(temp)
# Cluster assignment per row.
labels = gmm.predict(temp)

frame = pd.DataFrame(temp)
frame['cluster'] = labels
# TYPO FIX: the original column was spelled 'stregth_cluster'; renamed to
# 'strength_cluster' consistently within this section.
frame['strength_cluster'] = val2
frame.columns = ['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg',
                 'fineagg', 'age', 'cluster', 'strength_cluster']

# Scatter cement vs water coloured by GMM cluster ...
# FIX: the loop bodies below were un-indented in the original — restored.
color = ['blue', 'green', 'cyan', 'black', 'red']
for k in range(0, 2):
    data = frame[frame["cluster"] == k]
    plt.scatter(data["cement"], data["water"], c=color[k])
plt.show()

# ... and by the real strength-derived split, to see whether they match.
for k in range(0, 2):
    data = frame[frame["strength_cluster"] == k]
    plt.scatter(data["cement"], data["water"], c=color[k])
plt.show()

frame.head(10)

# Cross-tabulate model clusters vs strength bins to quantify the overlap.
from IPython.display import display
ct = pd.crosstab(frame['cluster'], frame['strength_cluster'])
display(ct)
# --- Forward feature selection (mlxtend SFS) driven by linear regression ---
from mlxtend.feature_selection import SequentialFeatureSelector as sfs

linR = LinearRegression()

# Fresh 70:30 split on the original (non-polynomial) X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

# k_features=5 was chosen from the earlier lasso coefficient analysis.
sfs1 = sfs(linR, k_features=5, forward=True, scoring='r2', cv=5)
sfs1 = sfs1.fit(X_train.values, y_train.values)
sfs1.get_metric_dict()

# Plot R^2 as features are added one at a time.
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
fig = plot_sfs(sfs1.get_metric_dict())
plt.title('Sequential Forward Selection (w. R^2)')
plt.grid()
plt.show()

# Which features were selected?
columnList = list(X_train.columns)
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)
subsetColumnList = [columnList[i] for i in feat_cols]
print(subsetColumnList)

# Lasso coefficients shown again for comparison.
# FIX: the loop body below was un-indented in the original — restored.
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, featureAnalysisLasso[idx]))
# Baseline decision tree on the full feature set — used mainly to read off
# the feature importances.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
print('Feature importances: \n',
      pd.DataFrame(dt_model.feature_importances_, columns=['Imp'], index=X_train.columns))

# Accumulator for the model-comparison table built up below.
modelComp = pd.DataFrame()

# Reset X / y from the scaled frame.
X = df_scaled.iloc[:, 0:8]
y = df_scaled.iloc[:, 8]
seed = 50
num_folds = 50

# Drop the weakest predictors (per the importance / lasso analysis above).
X = X.drop(['ash', 'coarseagg', 'fineagg'], axis=1)

# 70:30 split on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
X.columns
# --- Decision tree regressor with default hyper-parameters ---
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)

# Train / test R^2.
print('Performance on training data using DT:', dt_model.score(X_train, y_train))
print('Performance on testing data using DT:', dt_model.score(X_test, y_test))

# Test-set R^2 and MSE.
acc_DT = metrics.r2_score(y_test, y_pred)
print('Accuracy Test: ', acc_DT)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

# Predicted vs actual with a regression fit.
# FIX: seaborn removed jointplot's `stat_func` argument (>= 0.11); compute
# and print the Pearson correlation separately instead.
from scipy.stats import pearsonr
sns.jointplot(x=y_test, y=y_pred, kind="reg");
r, p_value = pearsonr(y_test, y_pred)
print('pearsonr = {:.3f}, p = {:.3g}'.format(r, p_value))

# K-fold cross-validation of the default tree.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
model1 = dt_model
results = cross_val_score(model1, X_train, y_train, cv=kfold)
print(results)
print("\n Average model Accuracy: %.3f%% with std. dev - (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# --- Regularized (pruned) decision tree ---
model = DecisionTreeRegressor(max_depth=8, random_state=seed, min_samples_leaf=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Train / test R^2.
print('Performance on training data using DT:', model.score(X_train, y_train))
print('Performance on testing data using DT:', model.score(X_test, y_test))

# Test-set R^2 and MSE.
acc_DT = metrics.r2_score(y_test, y_pred)
print('Accuracy Test: ', acc_DT)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

# K-fold cross-validation of the pruned tree.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("\n Average model Accuracy: %.3f%% with std. dev - (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

# Record the scores in the comparison table.
# FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
modelComp = pd.concat([modelComp,
                       pd.DataFrame({'Model': ['Decission Tree'],
                                     'Train Accuracy': [model.score(X_train, y_train)],
                                     'Test Accuracy': [model.score(X_test, y_test)],
                                     'Kfold-Mean-Accuracy': [results.mean()],
                                     'Kfold-StdDeviation': [results.std()]})],
                      ignore_index=True)
# --- Random forest with default hyper-parameters ---
model = RandomForestRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Train / test R^2.
print('Performance on training data using DT:', model.score(X_train, y_train))
print('Performance on testing data using DT:', model.score(X_test, y_test))

# Test-set R^2 and MSE.
acc_DT = metrics.r2_score(y_test, y_pred)
print('Accuracy Test: ', acc_DT)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

# K-fold cross-validation of the default forest.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("\n Average model Accuracy: %.3f%% with std. dev - (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# --- Grid search over random-forest hyper-parameters ---
from sklearn.model_selection import GridSearchCV
# FIX: max_features='auto' was removed for regressors in scikit-learn 1.3;
# 1.0 (consider all features) is the exact equivalent for a regressor.
parameters = {'bootstrap': [True],
              'max_depth': [5, 10, 15, 20, 25, 30, 35, 40, 50],
              'max_features': [1.0, 'sqrt'],
              'min_samples_leaf': [1, 2, 4, 8],
              'n_estimators': [100]}
clf = GridSearchCV(RandomForestRegressor(),
                   parameters,
                   cv=5,
                   verbose=2,
                   n_jobs=4)
clf.fit(X_train, y_train)
clf.best_params_
# The grid search gave better train accuracy but pushed the model into the
# overfit zone, so max_depth was pruned back after a few manual iterations.
model = RandomForestRegressor(bootstrap=True,
                              max_depth=9,
                              max_features='sqrt',
                              min_samples_leaf=1,
                              n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Train / test R^2.
print('Performance on training data using DT:', model.score(X_train, y_train))
print('Performance on testing data using DT:', model.score(X_test, y_test))

# Test-set R^2 and MSE.
acc_DT = metrics.r2_score(y_test, y_pred)
print('Accuracy Test: ', acc_DT)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

# K-fold cross-validation of the tuned forest.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("\n Average model Accuracy: %.3f%% with std. dev - (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

# Record the scores in the comparison table.
# FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
modelComp = pd.concat([modelComp,
                       pd.DataFrame({'Model': ['Random Forest Regressor'],
                                     'Train Accuracy': [model.score(X_train, y_train)],
                                     'Test Accuracy': [model.score(X_test, y_test)],
                                     'Kfold-Mean-Accuracy': [results.mean()],
                                     'Kfold-StdDeviation': [results.std()]})],
                      ignore_index=True)
modelComp
# --- PCA: compress the full 8-feature set, then fit a forest on the PCs ---
X = df_scaled.iloc[:, 0:8]
y = df_scaled.iloc[:, 8]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

from sklearn.decomposition import PCA
# First inspect the variance explained by all 8 components...
pca = PCA(8)
pca.fit(X_train)
pca.explained_variance_

# ...then keep the top 6 components.
pca = PCA(n_components=6)
pca.fit(X_train)
X_train_pca = pca.transform(X_train)  # PCs for the train data
X_test_pca = pca.transform(X_test)    # PCs for the test data
X_train_pca.shape, X_test_pca.shape

rf = RandomForestRegressor(bootstrap=True,
                           max_depth=15,
                           max_features='sqrt',
                           min_samples_leaf=1,
                           n_estimators=100)
rf.fit(X_train_pca, y_train)
y_pred = rf.predict(X_test_pca)

# Train / test R^2 on the PCA features.
print('Performance on training data using DT:', rf.score(X_train_pca, y_train))
print('Performance on testing data using DT:', rf.score(X_test_pca, y_test))

# Test-set R^2 and MSE.
acc_DT = metrics.r2_score(y_test, y_pred)
print('Accuracy Test: ', acc_DT)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
# BUG FIX: the original cross-validated on the raw X_train even though the
# model under evaluation is trained on the PCA-transformed features — use
# X_train_pca so the CV scores describe the same pipeline.
results = cross_val_score(rf, X_train_pca, y_train, cv=kfold)
print(results)
print("\n Average model Accuracy: %.3f%% with std. dev - (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# --- Gradient boosting with default hyper-parameters ---
# Rebuild X / y and drop the weak predictors again (the PCA section above
# restored the full feature set).
X = df_scaled.iloc[:, 0:8]
y = df_scaled.iloc[:, 8]
X = X.drop(['ash', 'coarseagg', 'fineagg'], axis=1)

# 70:30 split on the reduced feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

model = GradientBoostingRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Train / test R^2.
print('Performance on training data using DT:', model.score(X_train, y_train))
print('Performance on testing data using DT:', model.score(X_test, y_test))

# Test-set R^2 and MSE.
acc_DT = metrics.r2_score(y_test, y_pred)
print('Accuracy Test: ', acc_DT)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

# K-fold cross-validation of the default booster.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("\n Average model Accuracy: %.3f%% with std. dev - (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))
# --- Broad grid search over gradient-boosting hyper-parameters ---
# FIX: scikit-learn removed the legacy option names in 1.x:
#   loss 'ls' -> 'squared_error', 'lad' -> 'absolute_error';
#   criterion 'mae' was removed ('friedman_mse'/'squared_error' remain).
parameters = {
    "loss": ['squared_error', 'absolute_error', 'huber', 'quantile'],
    "learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "max_depth": [3, 5, 8],
    "max_features": ["log2", "sqrt"],
    "criterion": ["friedman_mse", "squared_error"],
    "subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    # kept small to bound search time; refined in the follow-up grid below
    "n_estimators": [10]
}
clf = GridSearchCV(estimator=GradientBoostingRegressor(),
                   param_grid=parameters,
                   cv=5,
                   verbose=2,
                   n_jobs=4)
clf.fit(X_train, y_train)
clf.best_params_
# Second grid (split from the first to keep runtime down): fix the winners
# from round one and sweep the sampling parameters.
# FIX: loss 'ls' was removed in scikit-learn 1.x -> 'squared_error'.
parameters = {
    "loss": ['squared_error'],
    "learning_rate": [0.2],
    "min_samples_split": np.linspace(0.1, 0.5, 5),
    "min_samples_leaf": np.linspace(0.1, 0.5, 5),
    "max_depth": [8],
    "max_features": ["log2"],
    "criterion": ["friedman_mse"],
    "subsample": [0.9],
    "n_estimators": [10, 100]
}
clf = GridSearchCV(estimator=GradientBoostingRegressor(),
                   param_grid=parameters,
                   cv=5,
                   verbose=2,
                   n_jobs=4)
clf.fit(X_train, y_train)
clf.best_params_
# --- Final tuned gradient-boosting model ---
# FIX: the original had a second, syntactically invalid model definition
# jammed onto one line here (two statements on one line, with legacy
# 'mae'/'huber' parameters); it was immediately superseded by the model
# below, so the dead line was removed.
# FIX: loss 'ls' was removed in scikit-learn 1.x -> 'squared_error'.
model = GradientBoostingRegressor(criterion='friedman_mse',
                                  learning_rate=0.2,
                                  loss='squared_error',
                                  max_depth=8,
                                  max_features='log2',
                                  n_estimators=100,
                                  subsample=0.9,
                                  min_samples_leaf=0.1,
                                  min_samples_split=0.2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Train / test R^2.
print('Performance on training data using DT:', model.score(X_train, y_train))
print('Performance on testing data using DT:', model.score(X_test, y_test))

# Test-set R^2 and MSE.
acc_DT = metrics.r2_score(y_test, y_pred)
print('Accuracy Test: ', acc_DT)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

# K-fold cross-validation of the tuned booster.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("\n Average model Accuracy: %.3f%% with std. dev - (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

# Record the scores in the comparison table.
# FIX: DataFrame.append was removed in pandas 2.0 — use pd.concat instead.
modelComp = pd.concat([modelComp,
                       pd.DataFrame({'Model': ['GradientBoostingRegressor'],
                                     'Train Accuracy': [model.score(X_train, y_train)],
                                     'Test Accuracy': [model.score(X_test, y_test)],
                                     'Kfold-Mean-Accuracy': [results.mean()],
                                     'Kfold-StdDeviation': [results.std()]})],
                      ignore_index=True)
modelComp
# --- AdaBoost regressor: defaults first, then custom parameters ---
model = AdaBoostRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Train / test R^2.
print('Performance on training data using DT:', model.score(X_train, y_train))
print('Performance on testing data using DT:', model.score(X_test, y_test))

# Test-set R^2 and MSE.
acc_DT = metrics.r2_score(y_test, y_pred)
print('Accuracy Test: ', acc_DT)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

# K-fold cross-validation of the default AdaBoost.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("\n Average model Accuracy: %.3f%% with std. dev - (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

# Custom parameters: more estimators, explicit learning rate and loss.
model = AdaBoostRegressor(n_estimators=100, learning_rate=1, loss='linear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Train / test R^2.
print('Performance on training data using DT:', model.score(X_train, y_train))
print('Performance on testing data using DT:', model.score(X_test, y_test))

# Test-set R^2 and MSE.
acc_DT = metrics.r2_score(y_test, y_pred)
print('Accuracy Test: ', acc_DT)
print('MSE: ', metrics.mean_squared_error(y_test, y_pred))

# K-fold cross-validation of the custom AdaBoost.
kfold = KFold(n_splits=num_folds, random_state=seed, shuffle=True)
results = cross_val_score(model, X_train, y_train, cv=kfold)
print(results)
print("\n Average model Accuracy: %.3f%% with std. dev - (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

modelComp
# --- Bootstrap confidence interval for the random-forest test score ---
from sklearn.utils import resample
from matplotlib import pyplot

# Reassemble features + target into one array; last column is the target.
data = X.join(y)
values = data.values

n_iterations = 1000            # number of bootstrap samples
n_size = int(len(data) * 0.5)  # size of each bootstrap sample

# FIX: the original collected scores in a variable named `stats`, shadowing
# `scipy.stats` imported at the top of the file — renamed to `scores`.
# FIX: the loop body below was un-indented in the original — restored.
# (Unused `predictions` variable and unused accuracy_score import dropped.)
scores = []
for i in range(n_iterations):
    # Sample rows with replacement for training ...
    train = resample(values, n_samples=n_size)
    # ... and test on the rows that did not make it into the sample.
    # PERF FIX: the original recomputed train.tolist() for every candidate
    # row (O(n^2)); build a hashable row set once instead.
    train_rows = {tuple(row) for row in train.tolist()}
    test = np.array([x for x in values if tuple(x) not in train_rows])

    model = RandomForestRegressor(bootstrap=True,
                                  max_depth=8,
                                  max_features='sqrt',
                                  min_samples_leaf=1,
                                  n_estimators=100)
    # Fit on features (all but last column) against the target (last column).
    model.fit(train[:, :-1], train[:, -1])
    y_test = test[:, -1]
    # R^2 on the out-of-sample rows for this bootstrap iteration.
    score = model.score(test[:, :-1], y_test)
    scores.append(score)

# Distribution of the bootstrap scores.
pyplot.hist(scores)
pyplot.show()

# 95% confidence interval: the 2.5th and 97.5th percentiles of the scores.
alpha = 0.95
p = ((1.0 - alpha) / 2.0) * 100
lower = max(0.0, np.percentile(scores, p))
p = (alpha + ((1.0 - alpha) / 2.0)) * 100
upper = min(1.0, np.percentile(scores, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))
# --- Bootstrap confidence interval for the gradient-boosting test score ---
n_iterations = 1000            # number of bootstrap samples
n_size = int(len(data) * 0.5)  # size of each bootstrap sample

# FIX: renamed the score list `stats` -> `scores` (the original shadowed
# scipy.stats imported at the top of the file).
# FIX: the loop body below was un-indented in the original — restored.
scores = []
for i in range(n_iterations):
    # Sample rows with replacement for training ...
    train = resample(values, n_samples=n_size)
    # ... and test on the rows that did not make it into the sample.
    # PERF FIX: build the sampled-row set once instead of re-listing the
    # train array for every candidate row (was O(n^2)).
    train_rows = {tuple(row) for row in train.tolist()}
    test = np.array([x for x in values if tuple(x) not in train_rows])

    # FIX: criterion 'mae' was removed in scikit-learn 1.1; 'friedman_mse'
    # (the library default for gradient boosting) is used instead.
    model = GradientBoostingRegressor(criterion='friedman_mse',
                                      learning_rate=0.2,
                                      loss='huber',
                                      max_depth=5,
                                      max_features='sqrt',
                                      n_estimators=100,
                                      subsample=0.9,
                                      min_samples_leaf=0.1,
                                      min_samples_split=0.2)
    # Fit on features (all but last column) against the target (last column).
    model.fit(train[:, :-1], train[:, -1])
    y_test = test[:, -1]
    # R^2 on the out-of-sample rows for this bootstrap iteration.
    score = model.score(test[:, :-1], y_test)
    scores.append(score)

# Distribution of the bootstrap scores.
pyplot.hist(scores)
pyplot.show()

# 95% confidence interval: the 2.5th and 97.5th percentiles of the scores.
alpha = 0.95
p = ((1.0 - alpha) / 2.0) * 100
lower = max(0.0, np.percentile(scores, p))
p = (alpha + ((1.0 - alpha) / 2.0)) * 100
upper = min(1.0, np.percentile(scores, p))
print('%.1f confidence interval %.1f%% and %.1f%%' % (alpha*100, lower*100, upper*100))